The image caption generator is a model that generates a caption based on the features present in the input image.
The basic working of the project is that the features are extracted from the images using pre-trained VGG16 model and then fed to the LSTM model along with the captions to train. The trained model is then capable of generating captions for any images that are fed to it.
The dataset used here is FLICKR 8K, which consists of around 8,091 images along with 5 captions for each image. If you have a powerful system with more than 16 GB of RAM and a graphics card with more than 4 GB of memory, you can try FLICKR 30K, which has around 30,000 images with captions.
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import keras
import sys, time, os, warnings
import numpy as np
import pandas as pd
from collections import Counter
# Silence every Python warning for the rest of the run, then report the
# interpreter and library versions in use.
warnings.filterwarnings("ignore")
print("python {}".format(sys.version))
# The `keras` name is removed right after printing its version; later code
# imports the keras submodules it needs explicitly.
print("keras version {}".format(keras.__version__)); del keras
print("tensorflow version {}".format(tf.__version__))
# Build a TensorFlow session configuration and register it with Keras.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.95 # let this process use up to 95% of the GPU memory
config.gpu_options.visible_device_list = "0"  # expose only GPU device "0" to this process
set_session(tf.Session(config=config))
def set_seed(sd=144):
    """Seed NumPy, Python's ``random`` module and TensorFlow with one value.

    sd: integer seed handed unchanged to all three generators.
    """
    import random as py_random

    import numpy.random as np_random
    import tensorflow

    # Seeding order: NumPy, then core Python, then TensorFlow.
    np_random.seed(sd)
    py_random.seed(sd)
    tensorflow.set_random_seed(sd)
## Directory holding the Flickr8k image files (note the trailing slash)
dir_Flickr_jpg = "Data/Flickr8k_Dataset/"
## Path of the text file holding the captions
dir_Flickr_text = "Data/Flickr8k.token.txt"
# Every entry of the image directory, in whatever order os.listdir returns;
# later code indexes into this list by position.
jpgs = os.listdir(dir_Flickr_jpg)
print("The number of jpg flies in Flicker8k: {}".format(len(jpgs)))
# Parse the caption file into one row per (filename, caption index, caption).
# Each parsed line is split on a tab into "<filename>#<index>" and the caption.
with open(dir_Flickr_text, 'r', encoding='utf8') as file:
    # The context manager closes the handle even if reading fails.
    text = file.read()
datatxt = []
for line in text.split('\n'):
    col = line.split('\t')
    if len(col) == 1:
        # No tab on this line (for example an empty line): nothing to parse.
        continue
    w = col[0].split("#")  # split "<filename>#<index>" into its two parts
    datatxt.append(w + [col[1].lower()])
df_txt = pd.DataFrame(datatxt, columns=["filename", "index", "caption"])
uni_filenames = np.unique(df_txt.filename.values)
print("The number of unique file names : {}".format(len(uni_filenames)))
print("The distribution of the number of captions for each image:")
# Maps "captions per image" to "number of images with that many captions".
# Printed explicitly: a bare expression is only echoed in an interactive
# session, so the heading above would otherwise be followed by nothing.
print(Counter(Counter(df_txt.filename.values).values()))
print(df_txt[:5])
from keras.preprocessing.image import load_img, img_to_array
from IPython.display import display
from PIL import Image
# Show a few images next to all of their captions.
npic = 5  # number of images to display
npix = 224
target_size = (npix, npix, 3)
count = 1  # 1-based subplot slot; each picture uses two slots (image, captions)
fig = plt.figure(figsize=(10, 20))
# Slice by `npic` so the number of pictures always matches the grid height.
for jpgfnm in uni_filenames[-npic:]:
    # os.path.join avoids the doubled separator that string concatenation
    # produces when the directory already ends with "/".
    filename = os.path.join(dir_Flickr_jpg, jpgfnm)
    captions = list(df_txt["caption"].loc[df_txt["filename"] == jpgfnm].values)
    image_load = load_img(filename, target_size=target_size)
    # Left column: the image itself, without ticks.
    ax = fig.add_subplot(npic, 2, count, xticks=[], yticks=[])
    ax.imshow(image_load)
    count += 1
    # Right column: a blank axes with one caption per unit of height.
    ax = fig.add_subplot(npic, 2, count)
    plt.axis('off')
    ax.plot()
    ax.set_xlim(0, 1)
    ax.set_ylim(0, len(captions))
    for i, caption in enumerate(captions):
        ax.text(0, i, caption, fontsize=20)
    count += 1
plt.show()
# Build a word-frequency table over every caption, most frequent word first.
def df_word(df_txt):
    """Count how often each whitespace-separated word occurs in the captions.

    df_txt: frame with a "caption" column of strings.

    Prints the number of distinct words and returns a frame with the columns
    "word" and "count", sorted by "count" in descending order and re-indexed
    from 0.
    """
    ct = Counter(
        token
        for caption in df_txt.caption.values
        for token in caption.split()
    )
    print('Vocabulary Size: %d' % len(ct))
    table = pd.DataFrame({"word": list(ct), "count": list(ct.values())})
    ranked = table.sort_values("count", ascending=False)
    return ranked.reset_index(drop=True)
# Word frequencies of the raw captions.
dfword = df_word(df_txt)
# Three most frequent words; as a bare expression this is only displayed when
# run interactively (e.g. as the last line of a notebook cell).
dfword.head(3)
The caption dataset contains punctuation, single-character words and numerical values that need to be cleaned before it is fed to the model, because an uncleaned dataset will not produce good captions for the images.
import string
# Example sentence used to demonstrate each cleaning step below.
text_original = "I ate 1000 apples and a banana. I have python v2.7. It's 2:30 pm. Could you buy me iphone7?"
print(text_original)
print("\nRemove punctuations..")
def remove_punctuation(text_original):
    """Return ``text_original`` with every ``string.punctuation`` character deleted."""
    # A translation table whose third argument lists the characters to drop.
    drop_table = str.maketrans('', '', string.punctuation)
    return text_original.translate(drop_table)
# Step 1 of the demonstration: strip punctuation from the example sentence.
text_no_punctuation = remove_punctuation(text_original)
print(text_no_punctuation)
print("\nRemove a single character word..")
def remove_single_character(text):
    """Drop every one-character word from ``text``.

    The words that remain are each preceded by a single space, so a non-empty
    result starts with a space; an input without longer words gives "".
    """
    return "".join(" " + token for token in text.split() if len(token) > 1)
# Step 2 of the demonstration: drop one-character words.
text_len_more_than1 = remove_single_character(text_no_punctuation)
print(text_len_more_than1)
print("\nRemove words with numeric values..")
def remove_numeric(text,printTF=False):
    """Keep only the words of ``text`` for which ``str.isalpha()`` is true.

    text: whitespace-separated words.
    printTF: when true, print every word next to its ``isalpha()`` result.

    Returns the kept words, each preceded by a single space ("" if none).
    """
    kept = []
    for token in text.split():
        alphabetic = token.isalpha()
        if printTF:
            print(" {:10} : {:}".format(token, alphabetic))
        if not alphabetic:
            continue
        kept.append(" " + token)
    return "".join(kept)
# Step 3 of the demonstration: drop words that are not purely alphabetic,
# printing the per-word decision along the way.
text_no_numeric = remove_numeric(text_len_more_than1,printTF=True)
print(text_no_numeric)
def text_clean(text_original):
    """Run the three cleaning steps on one caption and return the result.

    The steps are applied in order: punctuation removal, one-character word
    removal, then removal of words that are not purely alphabetic.
    """
    text = text_original
    for step in (remove_punctuation, remove_single_character, remove_numeric):
        text = step(text)
    return text
# Clean every caption and assign the whole column in one statement.
# Writing row by row through `df_txt["caption"].iloc[i] = ...` is chained
# indexing: pandas may apply the write to a temporary copy instead of
# `df_txt`, and the warning about it is hidden by the global warnings filter.
df_txt["caption"] = [text_clean(caption) for caption in df_txt["caption"].values]
# Number of words shown in each of the frequency bar charts below.
topn = 50
def plthist(dfsub, title="The top 50 most frequently appearing words"):
    """Draw a bar chart of word counts, one bar per row of ``dfsub``.

    dfsub: frame with "word" and "count" columns; its index gives the bar
        positions and the words become the x tick labels.
    title: text shown above the chart.
    """
    positions = dfsub.index
    text_size = 20
    plt.figure(figsize=(20, 3))
    plt.bar(positions, dfsub["count"])
    plt.xticks(positions, dfsub["word"], rotation=90, fontsize=text_size)
    plt.yticks(fontsize=text_size)
    plt.title(title, fontsize=text_size)
    plt.show()
# Recompute the word frequencies on the cleaned captions.
dfword = df_word(df_txt)
# The first `topn` rows are the most frequent words (the table is sorted by
# descending count) ...
plthist(dfword.iloc[:topn,:],
title="The top 50 most frequently appearing words")
# ... and the last `topn` rows are the least frequent ones.
plthist(dfword.iloc[-topn:,:],
title="The least 50 most frequently appearing words")
Start and end tokens have to be added to each caption so that the model can tell where a caption begins and ends, since the captions all have different lengths.
from copy import copy
def add_start_end_seq_token(captions):
    """Wrap every caption as ``'startseq ' + caption + ' endseq'``.

    captions: iterable of caption strings.

    Returns a new list with one wrapped string per input caption.
    """
    return ['startseq ' + text + ' endseq' for text in captions]
# Work on a copy so the start/end tokens are added to `df_txt0` only.
df_txt0 = copy(df_txt)
df_txt0["caption"] = add_start_end_seq_token(df_txt["caption"])
# Bare expression: only displayed when run interactively.
df_txt0.head(5)
# The frame without the tokens is no longer needed from here on.
del df_txt
# Bare expression: only displayed when run interactively.
df_txt0[:5]
from keras.applications import VGG16
# Build the VGG16 architecture including its top layers, without
# downloading any weights (weights=None).
modelvgg = VGG16(include_top=True,weights=None)
## load the locally saved weights
modelvgg.load_weights("Data/vgg16_weights_tf_dim_ordering_tf_kernels.h5")
modelvgg.summary()
The last layer of VGG-16 is excluded here because we are using the network only to extract features rather than for object classification.
from keras import models
# Drop the final classification layer by wiring a new model to the output of
# the second-to-last layer. Indexing with [-2] gives the same layer whether or
# not `modelvgg.layers` is a mutable list, whereas `layers.pop()` followed by
# `layers[-1]` only removes the classifier when the pop really mutates the model.
modelvgg = models.Model(inputs=modelvgg.inputs, outputs=modelvgg.layers[-2].output)
## show the deep learning model
modelvgg.summary()
Here the features are extracted from all the images in the dataset. VGG-16 model gives out 4096 features from the input image of size 224 * 224
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import preprocess_input
from collections import OrderedDict
# Map every image file name to its flattened feature vector from `modelvgg`,
# in the order of `jpgs`.
images = OrderedDict()
npix = 224  # image size is fixed at 224 because VGG16 model has been pre-trained to take that size.
target_size = (npix, npix, 3)
# No (len(jpgs), npix, npix, 3) pixel buffer is allocated here: such an array
# was never read, and as float64 it would need about 1.2 MB per image
# (roughly 10 GB for ~8,000 images).
for name in jpgs:
    # load an image from file, resized to the network's input size
    filename = dir_Flickr_jpg + '/' + name
    image = load_img(filename, target_size=target_size)
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    nimage = preprocess_input(image)
    # The network expects a batch axis, so predict on a batch of one image.
    y_pred = modelvgg.predict(nimage.reshape((1,) + nimage.shape[:3]))
    images[name] = y_pred.flatten()
To check the extracted features, we first look for clusters of images that belong together. PCA is used to reduce the dimensionality of the features obtained from the VGG-16 feature extraction from 4096 to 2.
First the clusters are plotted and few examples are taken from the bunch for displaying
from sklearn.decomposition import PCA
# Project the image feature vectors onto their first two principal components.
encoder = np.array(list(images.values()))
pca = PCA(n_components=2)
y_pca = pca.fit_transform(encoder)
# Hand-picked row indices to highlight, grouped by the colour used to draw
# them; they are positions in `y_pca` and therefore depend on the order of the
# image listing.
picked_pic = OrderedDict([
    ("red", [2720, 4250, 4983, 5862, 4079]),
    ("green", [2070, 3784, 7545, 4644, 4997]),
    ("magenta", [6320, 3432, 1348, 7472, 1518]),
    ("blue", [3901, 2168, 3465, 5285, 5328]),
    ("yellow", [144, 1172, 4423, 4780, 4448]),
    ("purple", [5087]),
])
fig, ax = plt.subplots(figsize=(15, 15))
# The points themselves are drawn in white; the row numbers carry the plot.
ax.scatter(y_pca[:, 0], y_pca[:, 1], c="white")
n_points = y_pca.shape[0]
irow = 0
while irow < n_points:
    # annotate() writes the row number at the location of the point
    ax.annotate(irow, y_pca[irow, :], color="black", alpha=0.5)
    irow += 1
# Write the picked rows again, this time in their group colour.
for color, irows in picked_pic.items():
    for irow in irows:
        ax.annotate(irow, y_pca[irow, :], color=color)
ax.set_xlabel("pca embedding 1", fontsize=30)
ax.set_ylabel("pca embedding 2", fontsize=30)
plt.show()
## Show the picked images, titled with their row number and group colour.
fig = plt.figure(figsize=(16, 20))
n_grid_rows = len(picked_pic)
count = 1  # 1-based subplot slot in a grid that is 5 columns wide
for color, irows in picked_pic.items():
    for ivec in irows:
        name = jpgs[ivec]
        filename = dir_Flickr_jpg + '/' + name
        image = load_img(filename, target_size=target_size)
        ax = fig.add_subplot(n_grid_rows, 5, count, xticks=[], yticks=[])
        count += 1
        # Draw on the axes just created (it is also the current axes).
        ax.imshow(image)
        ax.set_title("{} ({})".format(ivec,color))
plt.show()
# Keep only the rows whose caption index is "0", i.e. one caption per image.
is_first_caption = df_txt0["index"].values == "0"
df_txt0 = df_txt0.loc[is_first_caption, :]
# Collect the features of every captioned image that has an entry in `images`,
# remembering the row position so names and captions stay aligned with them.
dimages, keepindex = [], []
for i, fnm in enumerate(df_txt0.filename):
    if fnm not in images:
        continue
    keepindex.append(i)
    dimages.append(images[fnm])
kept_rows = df_txt0.iloc[keepindex]
# fnames are the names of the image files
fnames = kept_rows["filename"].values
# dcaptions are the captions of the images
dcaptions = kept_rows["caption"].values
# dimages are the extracted features of the images
dimages = np.array(dimages)
# Bare expression: only displayed when run interactively.
df_txt0[:5]
As the model can't take text as input, the captions need to be converted into vectors.
from keras.preprocessing.text import Tokenizer
## the maximum number of words in dictionary
nb_words = 6000
# `num_words` is the current name of this Tokenizer argument; `nb_words` is
# the legacy keyword.
tokenizer = Tokenizer(num_words=nb_words)
tokenizer.fit_on_texts(dcaptions)
# One more than the number of known words: the extra slot is index 0, which is
# not in `word_index`.
vocab_size = len(tokenizer.word_index) + 1
print("vocabulary size : {}".format(vocab_size))
# Each caption becomes a list of integer word indices.
dtexts = tokenizer.texts_to_sequences(dcaptions)
print(dtexts[:5])
# Fractions of the data reserved for testing and validation.
prop_test, prop_val = 0.2, 0.2
N = len(dtexts)
# Absolute sizes of the two held-out parts, rounded down.
Ntest, Nval = int(N*prop_test), int(N*prop_val)
def split_test_val_train(dtexts,Ntest,Nval):
    """Cut a sequence into consecutive test, validation and training slices.

    dtexts: any sliceable sequence.
    Ntest: number of leading items that form the test slice.
    Nval: number of following items that form the validation slice.

    Returns ``(test, validation, train)``; train is everything that remains.
    """
    val_end = Ntest + Nval
    test_part = dtexts[:Ntest]
    val_part = dtexts[Ntest:val_end]
    train_part = dtexts[val_end:]
    return test_part, val_part, train_part
# Split captions, image features and file names with the same boundaries so
# the three stay aligned item by item.
dt_test, dt_val, dt_train = split_test_val_train(dtexts,Ntest,Nval)
di_test, di_val, di_train = split_test_val_train(dimages,Ntest,Nval)
fnm_test,fnm_val, fnm_train = split_test_val_train(fnames,Ntest,Nval)
# Length of the longest tokenized caption over all splits; used as the padded
# input length.
maxlen = np.max([len(text) for text in dtexts])
print(maxlen)
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
def preprocessing(dtexts,dimages):
    """Expand (caption, image) pairs into next-word training samples.

    For every caption and every split position ``k >= 1``, one sample is
    produced: the first ``k`` word indices padded to the module-level
    ``maxlen``, the image's feature vector, and the ``k``-th word index
    one-hot encoded over the module-level ``vocab_size`` classes.

    dtexts: sequence of tokenized captions (lists of word indices).
    dimages: sequence of image feature vectors, aligned with ``dtexts``.

    Returns ``(Xtext, Ximage, ytext)`` as numpy arrays and prints the number
    of pairs and the three resulting shapes.
    """
    N = len(dtexts)
    print("# captions/images = {}".format(N))
    # captions and images must come in pairs
    assert N == len(dimages)
    Xtext, Ximage, ytext = [], [], []
    for text, image in zip(dtexts, dimages):
        for split_at in range(1, len(text)):
            prefix = text[:split_at]
            next_word = text[split_at]
            # pad the prefix so every text input has the same length
            padded_prefix = pad_sequences([prefix], maxlen=maxlen).flatten()
            # one-hot encode the word that follows the prefix
            target = to_categorical(next_word, num_classes=vocab_size)
            Xtext.append(padded_prefix)
            Ximage.append(image)
            ytext.append(target)
    Xtext, Ximage, ytext = np.array(Xtext), np.array(Ximage), np.array(ytext)
    print(" {} {} {}".format(Xtext.shape,Ximage.shape,ytext.shape))
    return Xtext, Ximage, ytext
# Expand the training and validation splits into next-word samples.
Xtext_train, Ximage_train, ytext_train = preprocessing(dt_train,di_train)
Xtext_val, Ximage_val, ytext_val = preprocessing(dt_val,di_val)
# The test split is not expanded: it is only used through predict_caption,
# which takes the image features directly.
#Xtext_test, Ximage_test, ytext_test = preprocessing(dt_test,di_test)
from keras import layers
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers.advanced_activations import LeakyReLU, PReLU
print(vocab_size)
## image feature branch: project the image feature vector to 256 units
dim_embedding = 64
input_image = layers.Input(shape=(Ximage_train.shape[1],))
fimage = layers.Dense(256, activation='relu', name="ImageFeature")(input_image)
## sequence branch: embed the padded word indices and run two stacked LSTMs
input_txt = layers.Input(shape=(maxlen,))
# mask_zero=True makes the following layers skip the zero padding.
ftxt = layers.Embedding(vocab_size, dim_embedding, mask_zero=True)(input_txt)
# The first LSTM returns the full sequence so the second LSTM can consume it.
ftxt = layers.LSTM(256, name="CaptionFeature", return_sequences=True)(ftxt)
se2 = Dropout(0.04)(ftxt)
ftxt = layers.LSTM(256, name="CaptionFeature2")(se2)
## combined model for decoder: add both 256-unit branches element-wise, then
## predict a distribution over the vocabulary for the next word
decoder = layers.add([ftxt, fimage])
decoder = layers.Dense(256, activation='relu')(decoder)
output = layers.Dense(vocab_size, activation='softmax')(decoder)
model = models.Model(inputs=[input_image, input_txt], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so it is not wrapped in
# print().
model.summary()
# fit model
from time import time
from keras.callbacks import TensorBoard
# Log each run to its own directory, named after the current timestamp.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
# NOTE(review): `time` is bound to the function by the `from time import time`
# just above, so the commented-out `time.time()` calls below would fail if
# re-enabled as written.
#start = time.time()
# Train on the expanded training samples and evaluate on the validation
# samples after every epoch; `hist.history` keeps the per-epoch losses.
hist = model.fit([Ximage_train, Xtext_train], ytext_train,
epochs=6, verbose=2,
batch_size=32,
validation_data=([Ximage_val, Xtext_val], ytext_val),callbacks=[tensorboard])
#end = time.time()
#print("TIME TOOK {:3.2f}MIN".format((end - start )/60))
# Plot the training and validation loss of every epoch on one set of axes.
for label in ("loss", "val_loss"):
    curve = hist.history[label]
    plt.plot(curve, label=label)
plt.xlabel("epochs")
plt.ylabel("loss")
plt.legend()
plt.show()
After the model finishes training, we can test its performance on some of the test images to see whether the generated captions are good enough. If they are, we can generate captions for the whole test dataset.
# Reverse lookup of the tokenizer vocabulary: word index -> word.
index_word = {index: word for word, index in tokenizer.word_index.items()}
def predict_caption(image):
    """Greedily generate a caption for one image feature vector.

    image: 2-D array holding a single row of image features, i.e. of shape
        (1, n_features); callers in this file pass
        ``image_feature.reshape(1, len(image_feature))``.

    Returns the caption as a space-separated string starting with "startseq".
    It ends with "endseq" if the model produced that word within ``maxlen``
    steps.
    """
    in_text = 'startseq'
    for _ in range(maxlen):
        # Re-encode everything generated so far and pad it to the input length.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=maxlen)
        yhat = model.predict([image, sequence], verbose=0)
        # Greedy decoding: take the most probable next word.
        yhat = np.argmax(yhat)
        # Index 0 is the padding slot and has no entry in index_word; stop
        # generating instead of raising KeyError if it is ever predicted.
        newword = index_word.get(yhat)
        if newword is None:
            break
        in_text += " " + newword
        if newword == "endseq":
            break
    return in_text
# Show five test images, each next to the caption predicted for it.
npic = 5
npix = 224
target_size = (npix, npix, 3)
count = 1  # 1-based subplot slot; each picture uses two slots
fig = plt.figure(figsize=(10, 20))
picked_names = fnm_test[8:13]
picked_features = di_test[8:13]
for jpgfnm, image_feature in zip(picked_names, picked_features):
    ## left slot: the image
    filename = dir_Flickr_jpg + '/' + jpgfnm
    image_load = load_img(filename, target_size=target_size)
    ax = fig.add_subplot(npic, 2, count, xticks=[], yticks=[])
    ax.imshow(image_load)
    count += 1
    ## right slot: the predicted caption on a blank axes
    features_as_row = image_feature.reshape(1, len(image_feature))
    caption = predict_caption(features_as_row)
    ax = fig.add_subplot(npic, 2, count)
    plt.axis('off')
    ax.plot()
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.text(0, 0.5, caption, fontsize=20)
    count += 1
plt.show()
After the model is trained, we have to test its prediction capabilities on the test dataset. A traditional accuracy metric can't be used on the predictions. For text evaluation we have a metric called the BLEU score. BLEU stands for Bilingual Evaluation Understudy; it is a score for comparing a candidate text to one or more reference texts.
Example:
# Toy example: score a candidate sentence against a single reference.
hypothesis = "I like dog"
hypothesis = hypothesis.split()
reference = "I do like dog"
references = [reference.split()] ## a list of references, each itself a list of tokens
from nltk.translate.bleu_score import sentence_bleu
print("BLEU={:4.3f}".format(sentence_bleu(references,hypothesis)))
# A second candidate scored against the same reference; note that "dog!" is
# one token here because the sentence is split on whitespace only.
hypothesis2 = "I love dog!".split()
print("BLEU={:4.3f}".format(sentence_bleu(references, hypothesis2)))
# Score every test image's predicted caption against its true caption and
# keep a few good and bad examples for display.
index_word = {index: word for word, index in tokenizer.word_index.items()}
nkeep = 5  # maximum number of examples kept in each of the two lists
pred_good, pred_bad, bleus = [], [], []
count = 0
for jpgfnm, image_feature, tokenized_text in zip(fnm_test, di_test, dt_test):
    count += 1
    if count % 200 == 0:
        print(" {:4.2f}% is done..".format(100*count/float(len(fnm_test))))
    caption_true = [index_word[i] for i in tokenized_text]
    caption_true = caption_true[1:-1]  ## remove startseq and endseq
    ## predicted caption
    caption = predict_caption(image_feature.reshape(1, len(image_feature)))
    caption = caption.split()[1:]  ## remove startseq
    # Only strip the last token when it really is endseq: a prediction that
    # ran to the length limit has no endseq, and an unconditional [1:-1]
    # would throw away its last real word.
    if caption and caption[-1] == "endseq":
        caption = caption[:-1]
    bleu = sentence_bleu([caption_true], caption)
    bleus.append(bleu)
    if bleu > 0.7 and len(pred_good) < nkeep:
        pred_good.append((bleu, jpgfnm, caption_true, caption))
    elif bleu < 0.3 and len(pred_bad) < nkeep:
        pred_bad.append((bleu, jpgfnm, caption_true, caption))
print("Mean BLEU {:4.3f}".format(np.mean(bleus)))
We can check out some of the generated caption's quality. Some times due to the complex nature of the images the generated captions are not acceptable. Below you would find some examples
def plot_images(pred_bad):
    """Show each example image beside its true caption, prediction and BLEU.

    pred_bad: sequence of ``(bleu, jpgfnm, caption_true, caption)`` tuples,
        where both captions are lists of words and ``jpgfnm`` is a file name
        inside the image directory.
    """
    def words_to_text(words):
        # Every word is preceded by a space, so the text starts with one.
        return "".join(" " + word for word in words)

    npix = 224
    target_size = (npix, npix, 3)
    npic = len(pred_bad)
    fig = plt.figure(figsize=(10, 20))
    count = 1  # 1-based subplot slot; each example uses two slots
    for bleu, jpgfnm, caption_true, caption in pred_bad:
        ## left slot: the image
        filename = dir_Flickr_jpg + '/' + jpgfnm
        image_load = load_img(filename, target_size=target_size)
        ax = fig.add_subplot(npic, 2, count, xticks=[], yticks=[])
        ax.imshow(image_load)
        count += 1

        ## right slot: three lines of text on a blank axes
        true_text = words_to_text(caption_true)
        predicted_text = words_to_text(caption)
        ax = fig.add_subplot(npic, 2, count)
        plt.axis('off')
        ax.plot()
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.text(0, 0.7, "true:" + true_text, fontsize=20)
        ax.text(0, 0.4, "pred:" + predicted_text, fontsize=20)
        ax.text(0, 0.1, "BLEU: {}".format(bleu), fontsize=20)
        count += 1
    plt.show()
# Display the kept low-scoring examples (BLEU below 0.3) ...
print("Bad Caption")
plot_images(pred_bad)
# ... and the kept high-scoring ones (BLEU above 0.7).
print("Good Caption")
plot_images(pred_good)
The model has been successfully trained to generate the captions as expected for the images. The caption generation has constantly been improved by fine tuning the model with different hyper parameter. Higher BLEU score indicates that the generated captions are very similar to those of the actual caption present on the images. Below you will find a table displaying different BLEU scores obtained by tuning the parameters:

With the help of Tensorboard, we were able to see how different training process had an impact on the model.
The validation loss falls up to the 5th epoch and then increases afterwards, while the training loss continues to fall.
The following were the major outcomes and observations of the training process and of testing the model on the test data: